<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta charset="utf-8">
<title>Living in a Unicode World | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="token-normalization.html" title="Normalizing Tokens">
<link rel="prev" href="asciifolding-token-filter.html" title="You Have an Accent">
<link rel="next" href="case-folding.html" title="Unicode Case Folding">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <script>
      // Create the global queue that Google Tag Manager reads events from.
      // This deliberately replaces any existing value, exactly as the original
      // one-line snippet did.
      window.dataLayer = [];
    </script>
    <!-- Fallback for browsers with scripting disabled: load the container in a hidden iframe. -->
    <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>
      // Google Tag Manager loader.
      //   w, d  window and document
      //   s     tag name used both to find the insertion point and to create the loader element
      //   l     name of the global data layer array
      //   i     container ID appended to the gtm.js URL
      (function (w, d, s, l, i) {
        // Make sure the data layer exists, then record when loading started.
        w[l] = w[l] || [];
        w[l].push({ 'gtm.start': new Date().getTime(), event: 'gtm.js' });

        var firstTag = d.getElementsByTagName(s)[0];
        var loader = d.createElement(s);
        // The data layer name is only passed along when it is not the default one.
        var layerParam = l != 'dataLayer' ? '&l=' + l : '';

        loader.async = true;
        // Explicit https: rather than a protocol-relative URL, so the container is
        // always fetched over TLS regardless of how this page itself was loaded.
        loader.src = 'https://www.googletagmanager.com/gtm.js?id=' + i + layerParam;

        // Insert the loader ahead of the first element with tag name `s` in the document.
        firstTag.parentNode.insertBefore(loader, firstTag);
      })(window, document, 'script', 'dataLayer', 'GTM-58RLH5');
    </script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the existing global dataLayer array, or create it if it is not defined yet.
      window.dataLayer = window.dataLayer || [];
      // Queue a command by pushing this call's `arguments` object (not a copy or
      // an array) onto dataLayer.
      function gtag(){dataLayer.push(arguments);}
      // Queue the 'js' command with the current time, then the 'config' command
      // for the ID UA-12395217-16.
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Qualtrics site-intercept loader (minified vendor snippet).
      // The constructor below takes four arguments:
      //   e  sampling value compared against 100 (100 means "always load")
      //   h  mode flag; the code distinguishes "v" and "r"
      //   f  name of the cookie used to persist the sampling state
      //   g  URL of the script to inject (the parameter shadows the outer `g` inside the constructor)
      (function(){var g=function(e,h,f,g){
      // get(a): return the value of the cookie named `a`, or null when it is not set.
      // Leading spaces are stripped from each "name=value" pair before matching.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a, c): write cookie `a` with value `c`, path "/", expiring 6048E5 ms
      // (604,800,000 ms = 7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): decide whether the script should be loaded; returns a boolean.
      //   - Cookie `f` present: its value is split on ":" into [mode, rate, counter].
      //   - Cookie absent and e != 100: in mode "v", e is first replaced by 100 or 0
      //     at random (100 when Math.random() < e/100); then [h, e, 0] is stored in the cookie.
      //   - Cookie absent and e == 100: returns true immediately.
      //   Afterwards: a stored rate of 100 returns true; mode "v" returns false;
      //   mode "r" increments the stored counter and returns true only when the
      //   previous counter value is a multiple of floor(100 / rate); any other mode returns true.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() passes, create a <script> element pointing at `g` and
      // append it to document.body (skipped if document.body is not available).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): run go() on the window "load" event, using addEventListener when
      // available and falling back to attachEvent("onload") otherwise.
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Instantiate with e=100 and mode "r", then start. Any exception thrown here
      // is swallowed by the empty catch so the snippet cannot break the page.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="languages.html">Dealing with Human Language</a></span>
»
<span class="breadcrumb-link"><a href="token-normalization.html">Normalizing Tokens</a></span>
»
<span class="breadcrumb-node">Living in a Unicode World</span>
</div>
<div class="navheader">
<span class="prev">
<a href="asciifolding-token-filter.html">« You Have an Accent</a>
</span>
<span class="next">
<a href="case-folding.html">Unicode Case Folding »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="unicode-normalization"></a>Living in a Unicode World<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/220_Token_normalization/30_Unicode_world.asciidoc">edit</a>
</h2>
</div></div></div>
<p>When Elasticsearch compares one token with another, it does so at the byte
level. In other words, for two tokens to be considered the same, they need to
consist of exactly the same bytes.  Unicode, however, allows you to write the
same letter in different ways.</p>
<p>For instance, what’s the difference between <em>&#xE9;</em> and <em>e&#x301;</em>? It
depends on who you ask. According to Elasticsearch, the first one consists of
the two bytes <code class="literal">0xC3 0xA9</code>, and the second one consists of three bytes, <code class="literal">0x65
0xCC 0x81</code>.</p>
<p>According to Unicode, the difference in how they are represented as bytes is
irrelevant, and they are the same letter. The first one is the single letter
<code class="literal">é</code>, while the second is a plain <code class="literal">e</code> combined with an acute accent <code class="literal">´</code>.</p>
<p>If you get your data from more than one source, it may happen that you have
the same  letters encoded in different ways, which may result in one form of
<code class="literal">déjà</code> not matching another!</p>
<p>Fortunately, a solution is at hand.  There are four Unicode <em>normalization
forms</em>, all of which convert Unicode characters into a standard format, making
all characters comparable at a byte level: <code class="literal">nfc</code>, <code class="literal">nfd</code>, <code class="literal">nfkc</code>, <code class="literal">nfkd</code>.</p>
<div class="sidebar">
<div class="titlepage"><div><div>
<p class="title"><strong>Unicode Normalization Forms</strong></p>
</div></div></div>
<p>The <em>composed</em> forms—<code class="literal">nfc</code> and <code class="literal">nfkc</code>—represent characters in the fewest
bytes possible.  So <code class="literal">é</code> is represented as the single letter <code class="literal">é</code>.  The
<em>decomposed</em> forms—<code class="literal">nfd</code> and <code class="literal">nfkd</code>—represent characters by their
constituent parts, that is <code class="literal">e</code> + <code class="literal">´</code>.</p>
<p>The <em>canonical</em> forms—<code class="literal">nfc</code> and <code class="literal">nfd</code>—represent ligatures like <code class="literal">ﬃ</code> or
<code class="literal">œ</code> as a single character, while the <em>compatibility</em> forms—<code class="literal">nfkc</code> and
<code class="literal">nfkd</code>—break down these composed characters into a simpler multiletter
equivalent: <code class="literal">f</code> + <code class="literal">f</code> + <code class="literal">i</code> or <code class="literal">o</code> + <code class="literal">e</code>.</p>
</div>
<p>It doesn’t really matter which normalization form you choose, as long as all
your text is in the same form.  That way, the same tokens consist of the
same bytes.  That said, the <em>compatibility</em> forms allow you to compare
ligatures like <code class="literal">ﬃ</code> with their simpler representation, <code class="literal">ffi</code>.</p>
<p>You can use the <code class="literal">icu_normalizer</code> token filter to ensure that all of your
tokens are in the same form:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">PUT /my_index
{
  "settings": {
    "analysis": {
      "filter": {
        "nfkc_normalizer": { <a id="CO143-1"></a><i class="conum" data-value="1"></i>
          "type": "icu_normalizer",
          "name": "nfkc"
        }
      },
      "analyzer": {
        "my_normalizer": {
          "tokenizer": "icu_tokenizer",
          "filter":  [ "nfkc_normalizer" ]
        }
      }
    }
  }
}</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO143-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>Normalize all tokens into the <code class="literal">nfkc</code> normalization form.</p>
</td>
</tr>
</table>
</div>
<div class="tip admon">
<div class="icon"></div>
<div class="admon_content">
<p>Besides the <code class="literal">icu_normalizer</code> token filter mentioned previously, there is also an
<code class="literal">icu_normalizer</code> <em>character</em> filter, which does the same job as the token
filter, but does so before the text reaches the tokenizer.  When using the
<code class="literal">standard</code> tokenizer or <code class="literal">icu_tokenizer</code>, this doesn’t really matter.  These
tokenizers know how to deal with all forms of Unicode correctly.</p>
<p>However, if you plan on using a different tokenizer, such as the <code class="literal">ngram</code>,
<code class="literal">edge_ngram</code>, or <code class="literal">pattern</code> tokenizers, it would make sense to use the
<code class="literal">icu_normalizer</code> character filter in preference to the token filter.</p>
</div>
</div>
<p>Usually, though, you will want to not only normalize the byte representation of tokens,
but also lowercase them. This can be done with <code class="literal">icu_normalizer</code>, using
the custom normalization form <code class="literal">nfkc_cf</code>, which we discuss in the next section.</p>
</div>
<div class="navfooter">
<span class="prev">
<a href="asciifolding-token-filter.html">« You Have an Accent</a>
</span>
<span class="next">
<a href="case-folding.html">Unicode Case Folding »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  // Define the global `initial_state` as an empty object.
  // NOTE(review): presumably consumed by /guide/static/docs.js — confirm against that script.
  window.initial_state = {}</script>
  </body>
</html>
